Handling image data
Dafnis Krasniqi (cours inspiré de Joaquin Vanschoren)
# kernel and image_patch are n x n matrices
pixel_out = np.sum(kernel * image_patch)

interactive(children=(IntSlider(value=0, description='i_step', max=783), Output()), _dom_classes=('widget-inte…
House numbers taken from Google Street View imagery, cropped and centered around the digits, but still containing neighboring numbers or other edge artifacts.
For recognizing digits, color is not important, so we convert the images to grayscale.
Demonstration
interactive(children=(IntSlider(value=0, description='i_step', max=1023), Output()), _dom_classes=('widget-int…
Demonstration
interactive(children=(FloatSlider(value=0.46, description='frequency', max=1.0, min=0.01, step=0.05), FloatSli…
Demonstration on the streetview data
interactive(children=(FloatSlider(value=0.46, description='frequency', max=1.0, min=0.01, step=0.05), FloatSli…
Another example: Fashion MNIST
Demonstration
interactive(children=(FloatSlider(value=0.46, description='frequency', max=1.0, min=0.01, step=0.05), FloatSli…
Fashion MNIST with multiple filters (filter bank)





In pure convnets, one input value spreads to 3x3 nodes of the first layer, 5x5 nodes of the second, etc.
Example with Keras:
Conv2D for 2D convolutional layers
MaxPooling2D for max-pooling
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
Observe how the 28x28x1 input image is transformed into a 3x3x64 feature map
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 26, 26, 32) 320
max_pooling2d (MaxPooling2D (None, 13, 13, 32) 0
)
conv2d_1 (Conv2D) (None, 11, 11, 64) 18496
max_pooling2d_1 (MaxPooling (None, 5, 5, 64) 0
2D)
conv2d_2 (Conv2D) (None, 3, 3, 64) 36928
=================================================================
Total params: 55,744
Trainable params: 55,744
Non-trainable params: 0
_________________________________________________________________
Completing the network
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
Complete network
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 26, 26, 32) 320
max_pooling2d (MaxPooling2D (None, 13, 13, 32) 0
)
conv2d_1 (Conv2D) (None, 11, 11, 64) 18496
max_pooling2d_1 (MaxPooling (None, 5, 5, 64) 0
2D)
conv2d_2 (Conv2D) (None, 3, 3, 64) 36928
flatten (Flatten) (None, 576) 0
dense (Dense) (None, 64) 36928
dense_1 (Dense) (None, 10) 650
=================================================================
Total params: 93,322
Trainable params: 93,322
Non-trainable params: 0
_________________________________________________________________
Run the model on MNIST dataset
Accuracy: 0.988800048828125
Tip:
model.save(os.path.join(model_dir, 'mnist.h5'))
with open(os.path.join(model_dir, 'mnist_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
ImageDataGenerator: allows encoding, resizing, and rescaling JPEG images
train_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
train_dir, # Directory with images
target_size=(150, 150), # Resize images
batch_size=20, # Return 20 images at a time
class_mode='binary') # Binary labels
Since the images are larger and more complex, we add another convolutional layer and increase the number of filters to 128.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_3 (Conv2D) (None, 148, 148, 32) 896
max_pooling2d_2 (MaxPooling (None, 74, 74, 32) 0
2D)
conv2d_4 (Conv2D) (None, 72, 72, 64) 18496
max_pooling2d_3 (MaxPooling (None, 36, 36, 64) 0
2D)
conv2d_5 (Conv2D) (None, 34, 34, 128) 73856
max_pooling2d_4 (MaxPooling (None, 17, 17, 128) 0
2D)
conv2d_6 (Conv2D) (None, 15, 15, 128) 147584
max_pooling2d_5 (MaxPooling (None, 7, 7, 128) 0
2D)
flatten_1 (Flatten) (None, 6272) 0
dense_2 (Dense) (None, 512) 3211776
dense_3 (Dense) (None, 1) 513
=================================================================
Total params: 3,453,121
Trainable params: 3,453,121
Non-trainable params: 0
_________________________________________________________________
The fit function also supports generators
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-4),
metrics=['acc'])
history = model.fit(
train_generator, steps_per_epoch=100,
epochs=30, verbose=0,
validation_data=validation_generator,
validation_steps=50)
datagen = ImageDataGenerator(
rotation_range=40, # Rotate image up to 40 degrees
width_shift_range=0.2, # Shift image left-right up to 20% of image width
height_shift_range=0.2,# Shift image up-down up to 20% of image height
shear_range=0.2, # Shear (slant) the image up to 0.2 degrees
zoom_range=0.2, # Zoom in up to 20%
horizontal_flip=True, # Horizontally flip the image
fill_mode='nearest')
Example
We also add Dropout before the Dense layer
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
(Almost) no more overfitting!
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_10 (Conv2D) (None, 148, 148, 32) 896
max_pooling2d_8 (MaxPooling (None, 74, 74, 32) 0
2D)
conv2d_11 (Conv2D) (None, 72, 72, 64) 18496
max_pooling2d_9 (MaxPooling (None, 36, 36, 64) 0
2D)
conv2d_12 (Conv2D) (None, 34, 34, 128) 73856
max_pooling2d_10 (MaxPoolin (None, 17, 17, 128) 0
g2D)
conv2d_13 (Conv2D) (None, 15, 15, 128) 147584
max_pooling2d_11 (MaxPoolin (None, 7, 7, 128) 0
g2D)
flatten_2 (Flatten) (None, 6272) 0
dropout (Dropout) (None, 6272) 0
dense_4 (Dense) (None, 512) 3211776
dense_5 (Dense) (None, 1) 513
=================================================================
Total params: 3,453,121
Trainable params: 3,453,121
Non-trainable params: 0
_________________________________________________________________
layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)
Output of the first Conv2D layer, 3rd channel (filter):
Output of filter 16:
The same filter responds quite differently for other inputs (green detector?).

from keras import backend as K
input_img = np.random.random((1, size, size, 3)) * 20 + 128.
loss = K.mean(layer_output[:, :, :, filter_index])
grads = K.gradients(loss, model.input)[0] # Compute gradient
for i in range(40): # Run gradient ascent for 40 steps
loss_v, grads_v = K.function([input_img], [loss, grads])
input_img_data += grads_v * step

Let's do this again for the VGG16 network pretrained on ImageNet (much larger)
model = VGG16(weights='imagenet', include_top=False)
Model: "vgg16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, None, None, 3)] 0
block1_conv1 (Conv2D) (None, None, None, 64) 1792
block1_conv2 (Conv2D) (None, None, None, 64) 36928
block1_pool (MaxPooling2D) (None, None, None, 64) 0
block2_conv1 (Conv2D) (None, None, None, 128) 73856
block2_conv2 (Conv2D) (None, None, None, 128) 147584
block2_pool (MaxPooling2D) (None, None, None, 128) 0
block3_conv1 (Conv2D) (None, None, None, 256) 295168
block3_conv2 (Conv2D) (None, None, None, 256) 590080
block3_conv3 (Conv2D) (None, None, None, 256) 590080
block3_pool (MaxPooling2D) (None, None, None, 256) 0
block4_conv1 (Conv2D) (None, None, None, 512) 1180160
block4_conv2 (Conv2D) (None, None, None, 512) 2359808
block4_conv3 (Conv2D) (None, None, None, 512) 2359808
block4_pool (MaxPooling2D) (None, None, None, 512) 0
block5_conv1 (Conv2D) (None, None, None, 512) 2359808
block5_conv2 (Conv2D) (None, None, None, 512) 2359808
block5_conv3 (Conv2D) (None, None, None, 512) 2359808
block5_pool (MaxPooling2D) (None, None, None, 512) 0
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________
First 64 filters for 1st convolutional layer in block 1: simple edges and colors
Filters in 2nd block of convolution layers: simple textures (combined edges and colors)
Filters in 3rd block of convolution layers: more natural textures
Filters in 4th block of convolution layers: feathers, eyes, leaves,...

Illustration (cats vs dogs)
More realistic example:
model = VGG16(weights='imagenet')

Preprocessing
from keras.applications.vgg16 import preprocess_input
img_path = '../images/10_elephants.jpg'
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0) # Transform to batch of size (1, 224, 224, 3)
x = preprocess_input(x)
preds = model.predict(x)
Predicted: [('n02504458', 'African_elephant', 0.90988594), ('n01871265', 'tusker', 0.085724816), ('n02504013', 'Indian_elephant', 0.0043471307)]
Visualize the class activation map
Superimposed on the original image

conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
Model: "vgg16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 150, 150, 3)] 0
block1_conv1 (Conv2D) (None, 150, 150, 64) 1792
block1_conv2 (Conv2D) (None, 150, 150, 64) 36928
block1_pool (MaxPooling2D) (None, 75, 75, 64) 0
block2_conv1 (Conv2D) (None, 75, 75, 128) 73856
block2_conv2 (Conv2D) (None, 75, 75, 128) 147584
block2_pool (MaxPooling2D) (None, 37, 37, 128) 0
block3_conv1 (Conv2D) (None, 37, 37, 256) 295168
block3_conv2 (Conv2D) (None, 37, 37, 256) 590080
block3_conv3 (Conv2D) (None, 37, 37, 256) 590080
block3_pool (MaxPooling2D) (None, 18, 18, 256) 0
block4_conv1 (Conv2D) (None, 18, 18, 512) 1180160
block4_conv2 (Conv2D) (None, 18, 18, 512) 2359808
block4_conv3 (Conv2D) (None, 18, 18, 512) 2359808
block4_pool (MaxPooling2D) (None, 9, 9, 512) 0
block5_conv1 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv2 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv3 (Conv2D) (None, 9, 9, 512) 2359808
block5_pool (MaxPooling2D) (None, 4, 4, 512) 0
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________
Call predict on the convolutional base to build new features
generator = datagen.flow_from_directory(dir, target_size=(150, 150),
batch_size=batch_size, class_mode='binary')
for inputs_batch, labels_batch in generator:
features_batch = conv_base.predict(inputs_batch)
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
Max val_acc 0.90500003
model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
conv_base.trainable = False
We now get about 90% accuracy again, and very little overfitting
Max val_acc 0.906
for layer in conv_base.layers:
if layer.name == 'block5_conv1':
layer.trainable = True
else:
layer.trainable = False
Visualized

model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-5),
metrics=['acc'])
history = model.fit(
train_generator, steps_per_epoch=100, epochs=100,
validation_data=validation_generator,
validation_steps=50)
Almost 95% accuracy. The curves are quite noisy, though.
Max val_acc 0.90800005
Max val_acc 0.9039536851123335